################################################################################
##
## Purpose: This script uses httr to query the govinfo API
##          (https://api.govinfo.gov/) for information on politicians. It is
##          included for reference, but is not part of the replication
##          materials due to the reliance on an API key.
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
##  - Inputs:
##  - Outputs:
##    - ./data/raw/demographics/demographics.csv
##
################################################################################


# Session setup ----------------------------------------------------------------
# Clear every object from the global environment so leftovers from an earlier
# session cannot leak into this run, then trigger a garbage collection.
rm(list = ls())
gc()

# rvest and RSelenium are not called anywhere in this script. They are kept as
# soft dependencies: require() only warns when a package is not installed.
require(rvest)
require(RSelenium)

# tidyverse, xml2 and httr supply functions this script calls (bind_rows,
# str_extract, xml_contents, as_list, GET, content, ...). Attach them with
# library() so a missing package stops the run right here with a clear error
# instead of failing later, part-way through the scrape.
library(tidyverse)
library(xml2)
library(httr)

# Fix the RNG seed (nothing below draws random numbers, but the seed is kept so
# the session state matches earlier runs of this script).
set.seed(123)

# Compute details
# Prints the run date followed by a short description of the machine. The probe
# is chosen from the operating-system name reported by Sys.info():
#   - Windows: RAM capacity, CPU vendor/model (via wmic) and the core count.
#   - Linux:   %cpu, %mem, pid and command of every running `rsession` process.
#   - other:   a note that no probe is implemented.
# Finishes by printing sessionInfo() so package versions end up in the log.
print(paste0('Compute environment from ',Sys.Date(),' run by Bisbee'))
os_name <- Sys.info()[['sysname']]
if (os_name == 'Windows') {
  # The indexing ([-1] / [2]) presumably skips the header line wmic prints
  # before the values -- not verifiable from this script alone.
  ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name <- system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id <- system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov

  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if (os_name == 'Linux') {
  # Sys.info() reports 'Linux' here; the previous comparison against 'Linuxs'
  # could never match, so this branch was unreachable.
  ps_lines <- system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE)
  # Drop the header line, trim the padding used to align the columns, and split
  # on runs of whitespace so field positions do not depend on column widths.
  splitted <- strsplit(trimws(ps_lines[-1]), "\\s+")
  # One row per process; everything after the third field is the command line.
  # When no rsession process is running the list is empty and df is NULL.
  df <- do.call(rbind, lapply(splitted,
                              function(x) data.frame(
                                cpu = as.numeric(x[1]),
                                mem = as.numeric(x[2]),
                                pid = as.numeric(x[3]),
                                cmd = paste(x[-(1:3)], collapse = " "))))
  # Print explicitly (as the Windows branch does) so the table is shown even
  # when the script is run through source().
  print(df)
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

sessionInfo()


# Scrape the govinfo CDIR collection -------------------------------------------
# Workflow:
#   1. List the packages in the CDIR collection.
#   2. For each package, list its CONGRESSMEMBERSTATE granules.
#   3. For each granule whose title mentions a Representative or Senator,
#      download its HTML rendition and pull the biographical fields out of the
#      children of the <pre> node.
#   4. Stack everything into `demogs` and write it to
#      ./data/raw/demographics/demographics.csv
# Requests are spaced out with Sys.sleep() between calls.

api <- '' # Enter API key here

# Download the HTML rendition of one granule and return the second top-level
# node of the parsed document as a nested list.
# NOTE(review): the second node is presumably the <body>; confirm against a
# live response.
#   pack_id : package identifier
#   gran_id : granule identifier within that package
#   api_key : govinfo API key
fetch_granule_node <- function(pack_id, gran_id, api_key) {
  resp <- GET(paste0('https://api.govinfo.gov/packages/',pack_id,'/granules/',gran_id,'/htm?api_key=',api_key))
  as_list(xml_contents(content(resp,'parsed'))[[2]])
}

# TRUE when the parsed page carries the "server requested closed" notice in its
# first <font><b> element. Always returns a single TRUE/FALSE, including when
# the page has no <font> or <b> node, so it is safe to use as a while()
# condition after a successful re-fetch.
is_closed_reply <- function(nd) {
  bold <- nd$font$b
  length(bold) > 0 && isTRUE(any(grepl('server requested closed',bold[[1]])))
}

# Extract the first match of `pattern` from `text`, delete everything matching
# `strip` from it, and trim surrounding whitespace. Yields NA when `pattern`
# does not match.
pull_field <- function(text, pattern, strip) {
  trimws(gsub(strip,'',str_extract(text,pattern)))
}

docs <- GET(paste0('https://api.govinfo.gov/collections/CDIR/1997-01-01T00%3A00%3A00Z?offset=0&pageSize=100&api_key=',api))
# Fail early with the HTTP error (e.g. a missing or invalid API key) rather
# than with an obscure error inside the loop below.
stop_for_status(docs)
docsRes <- content(docs,'parsed')
docsRes$packages
demogs <- NULL
# seq_along() (not 1:length()) so an empty list means zero iterations.
for (k in seq_along(docsRes$packages)) {
  Sys.sleep(1)
  pkg <- docsRes$packages[[k]]
  id <- pkg$packageId
  cong <- pkg$congress
  # Skip packages already present in the accumulated results.
  if (id %in% demogs$packId) { next }
  gran <- GET(paste0('https://api.govinfo.gov/packages/',id,'/granules?granuleClass=CONGRESSMEMBERSTATE&offset=0&pageSize=1000&api_key=',api))
  granRes <- content(gran,'parsed')

  # Rows for this package are collected in a list and bound once at the end.
  rows <- list()
  for (j in seq_along(granRes$granules)) {
    granule <- granRes$granules[[j]]
    if (!grepl('Representative|Senat',granule$title)) {
      next
    }
    title <- granule$title
    granId <- granule$granuleId
    if (granId %in% demogs$granId) { next }
    Sys.sleep(1)
    nd <- fetch_granule_node(id, granId, api)
    # Keep retrying (with a longer pause) for as long as the reply is the
    # "server requested closed" notice. There is no retry limit.
    while (is_closed_reply(nd)) {
      Sys.sleep(5)
      nd <- fetch_granule_node(id, granId, api)
    }
    toLook <- nd$pre
    for (i in seq_along(toLook)) {
      entry <- toLook[[i]]
      # Only children that mention at least one biographical keyword are parsed.
      if (!grepl('born in|education|professional|family|children',entry)) {
        next
      }
      # The name is matched on the raw text because its pattern relies on the
      # line breaks; every other field is matched with line breaks removed.
      flat <- gsub('\\\n','',entry)
      name <- pull_field(entry,'\\\n\\\n\\s{2,}[A-Z]+.*?,','\\\n\\\n\\s{2,}|,')
      born <- pull_field(flat,'born in .*?;','born in |;')
      educ <- pull_field(flat,'education:.*?:','education: |professional')
      prof <- pull_field(flat,'professional:.*?:','professional:|public service')
      pubSrv <- pull_field(flat,'public service:.*?:','public service:|family')
      fam <- pull_field(flat,'(family:|married:).*?:','family:|children')
      child <- pull_field(flat,'children:.*?:','children:|committees')
      rows[[length(rows) + 1]] <- as_tibble(data.frame(name = name,born = born,educ = educ,
                                                       prof = prof,pubSrv = pubSrv,fam = fam,child = child,
                                                       granId = granId,packId = id,title = title,cong = cong,
                                                       stringsAsFactors = FALSE))
    }
    cat(granId,'\n')
  }
  dems <- bind_rows(rows)
  # Only append when this package produced rows, so `demogs` stays NULL (and
  # the `%in% demogs$...` checks stay quiet) until there is real data.
  if (length(rows) > 0) {
    demogs <- bind_rows(demogs,dems)
  }
  cat(id,'\n')
}
if (is.null(demogs)) {
  warning('No demographic records were scraped; writing an empty file.', call. = FALSE)
  demogs <- tibble()
}
# Show which packages made it into the results, in sorted order.
unique(demogs$packId)[order(unique(demogs$packId))]
# Make sure the target directory exists so the write cannot fail after the
# whole scrape has completed.
out_file <- './data/raw/demographics/demographics.csv'
dir.create(dirname(out_file), recursive = TRUE, showWarnings = FALSE)
write.csv(demogs,file = out_file)

# EOF